I. Setting up the Problem



In [111]:

    
import pandas as pd
import numpy as np
from IPython.display import Image
import matplotlib.pyplot as plt

# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score



In [112]:

    
filename ="CrowdstormingDataJuly1st.csv"
Data = pd.read_csv(filename)

1) Peeking into the Data



In [113]:

    
Data.ix[:10,:13]









    Out[113]:






  
    
      
      playerShort
      player
      club
      leagueCountry
      birthday
      height
      weight
      position
      games
      victories
      ties
      defeats
      goals
    
  
  
    
      0
      lucas-wilchez
      Lucas Wilchez
      Real Zaragoza
      Spain
      31.08.1983
      177.0
      72.0
      Attacking Midfielder
      1
      0
      0
      1
      0
    
    
      1
      john-utaka
      John Utaka
      Montpellier HSC
      France
      08.01.1982
      179.0
      82.0
      Right Winger
      1
      0
      0
      1
      0
    
    
      2
      abdon-prats
      Abdón Prats
      RCD Mallorca
      Spain
      17.12.1992
      181.0
      79.0
      NaN
      1
      0
      1
      0
      0
    
    
      3
      pablo-mari
      Pablo Marí
      RCD Mallorca
      Spain
      31.08.1993
      191.0
      87.0
      Center Back
      1
      1
      0
      0
      0
    
    
      4
      ruben-pena
      Rubén Peña
      Real Valladolid
      Spain
      18.07.1991
      172.0
      70.0
      Right Midfielder
      1
      1
      0
      0
      0
    
    
      5
      aaron-hughes
      Aaron Hughes
      Fulham FC
      England
      08.11.1979
      182.0
      71.0
      Center Back
      1
      0
      0
      1
      0
    
    
      6
      aleksandar-kolarov
      Aleksandar Kolarov
      Manchester City
      England
      10.11.1985
      187.0
      80.0
      Left Fullback
      1
      1
      0
      0
      0
    
    
      7
      alexander-tettey
      Alexander Tettey
      Norwich City
      England
      04.04.1986
      180.0
      68.0
      Defensive Midfielder
      1
      0
      0
      1
      0
    
    
      8
      anders-lindegaard
      Anders Lindegaard
      Manchester United
      England
      13.04.1984
      193.0
      80.0
      Goalkeeper
      1
      0
      1
      0
      0
    
    
      9
      andreas-beck
      Andreas Beck
      1899 Hoffenheim
      Germany
      13.03.1987
      180.0
      70.0
      Right Fullback
      1
      1
      0
      0
      0
    
    
      10
      antonio-rukavina
      Antonio Rukavina
      Real Valladolid
      Spain
      26.01.1984
      177.0
      74.0
      Right Fullback
      2
      2
      0
      0
      0



In [114]:

    
Data.ix[:10,13:28]









    Out[114]:






  
    
      
      yellowCards
      yellowReds
      redCards
      photoID
      rater1
      rater2
      refNum
      refCountry
      Alpha_3
      meanIAT
      nIAT
      seIAT
      meanExp
      nExp
      seExp
    
  
  
    
      0
      0
      0
      0
      95212.jpg
      0.25
      0.50
      1
      1
      GRC
      0.326391
      712.0
      0.000564
      0.396000
      750.0
      0.002696
    
    
      1
      1
      0
      0
      1663.jpg
      0.75
      0.75
      2
      2
      ZMB
      0.203375
      40.0
      0.010875
      -0.204082
      49.0
      0.061504
    
    
      2
      1
      0
      0
      NaN
      NaN
      NaN
      3
      3
      ESP
      0.369894
      1785.0
      0.000229
      0.588297
      1897.0
      0.001002
    
    
      3
      0
      0
      0
      NaN
      NaN
      NaN
      3
      3
      ESP
      0.369894
      1785.0
      0.000229
      0.588297
      1897.0
      0.001002
    
    
      4
      0
      0
      0
      NaN
      NaN
      NaN
      3
      3
      ESP
      0.369894
      1785.0
      0.000229
      0.588297
      1897.0
      0.001002
    
    
      5
      0
      0
      0
      3868.jpg
      0.25
      0.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      6
      0
      0
      0
      47704.jpg
      0.00
      0.25
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      7
      0
      0
      0
      22356.jpg
      1.00
      1.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      8
      0
      0
      0
      16528.jpg
      0.25
      0.25
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      9
      0
      0
      0
      36499.jpg
      0.00
      0.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      10
      1
      0
      0
      59786.jpg
      0.00
      0.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752

II. Preparing data

1) Keep only players that have a Rater Image



In [115]:

    
# Remove the players without rater 1 / 2 (ie: without photo) because we won't be 
# able to train or test the values (this can be done as bonus later)

Data_hasImage = Data[pd.notnull(Data['photoID'])]

2) Getting rif of referees and grouping data by soccer player

We need to aggregate the information about referees and group the result by soccer player. It means that each line will correspond to a soccer player, with the sum of all the cards he got, and we won't know anymore who gaves the cards.



In [116]:

    
# Group by player and do the sum of every column, except for mean_rater (skin color) that we need to move away during the calculation (we don't want to sum skin color values !)
Data_aggregated = Data_hasImage.drop(['refNum', 'refCountry'], 1)
Data_aggregated = Data_aggregated.groupby(['playerShort', 'position'])['games','yellowCards', 'yellowReds', 'redCards'].sum()
Data_aggregated = Data_aggregated.reset_index()

# Take information of skin color for each player
Data_nbGames_skinColor = Data_hasImage
Data_nbGames_skinColor.drop_duplicates('playerShort')
Data_nbGames_skinColor['skinColor']=(Data_nbGames_skinColor['rater1']+Data_hasImage['rater2'])/2
Data_nbGames_skinColor = pd.DataFrame(Data_nbGames_skinColor[['playerShort','skinColor']])
Data_aggregated = pd.merge(left=Data_aggregated,right=Data_nbGames_skinColor, how='left', left_on='playerShort', right_on='playerShort')
Data_aggregated = Data_aggregated.drop_duplicates('playerShort')
Data_aggregated = Data_aggregated.reset_index(drop=True)
Data_aggregated









    



C:\Users\merin\Anaconda3\lib\site-packages\ipykernel\__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[116]:






  
    
      
      playerShort
      position
      games
      yellowCards
      yellowReds
      redCards
      skinColor
    
  
  
    
      0
      aaron-hughes
      Center Back
      654
      19
      0
      0
      0.125
    
    
      1
      aaron-hunt
      Attacking Midfielder
      336
      42
      0
      1
      0.125
    
    
      2
      aaron-lennon
      Right Midfielder
      412
      11
      0
      0
      0.250
    
    
      3
      aaron-ramsey
      Center Midfielder
      260
      31
      0
      1
      0.000
    
    
      4
      abdelhamid-el-kaoutari
      Center Back
      124
      8
      4
      2
      0.250
    
    
      5
      abdou-traore_2
      Right Midfielder
      97
      11
      1
      0
      0.750
    
    
      6
      abdoulaye-diallo_2
      Goalkeeper
      24
      0
      0
      0
      0.875
    
    
      7
      abdoulaye-keita_2
      Goalkeeper
      3
      0
      0
      0
      0.875
    
    
      8
      abdoulwhaid-sissoko
      Defensive Midfielder
      121
      21
      0
      2
      1.000
    
    
      9
      abdul-rahman-baba
      Left Fullback
      50
      3
      0
      1
      0.875
    
    
      10
      abdul-razak
      Center Midfielder
      36
      2
      0
      0
      1.000
    
    
      11
      abel-aguilar
      Defensive Midfielder
      246
      70
      3
      2
      0.375
    
    
      12
      abou-diaby
      Center Midfielder
      208
      24
      0
      2
      0.750
    
    
      13
      adam-bodzek
      Defensive Midfielder
      211
      66
      1
      0
      0.250
    
    
      14
      adam-federici
      Goalkeeper
      206
      4
      0
      0
      0.000
    
    
      15
      adam-hlousek
      Left Midfielder
      131
      21
      0
      0
      0.000
    
    
      16
      adam-johnson
      Left Midfielder
      339
      21
      0
      0
      0.000
    
    
      17
      adam-pinter
      Center Back
      82
      14
      1
      1
      0.000
    
    
      18
      adam-smith_3
      Right Fullback
      131
      29
      2
      1
      0.000
    
    
      19
      adam-szalai
      Center Forward
      182
      18
      1
      0
      0.250
    
    
      20
      adan
      Goalkeeper
      44
      1
      0
      1
      0.000
    
    
      21
      adel-taarabt
      Left Midfielder
      209
      23
      1
      0
      0.250
    
    
      22
      adil-rami
      Center Back
      286
      59
      2
      3
      0.125
    
    
      23
      adrian-colunga
      Center Forward
      178
      22
      1
      0
      0.250
    
    
      24
      adrian-mutu
      Left Winger
      453
      90
      2
      3
      0.250
    
    
      25
      adrian_2
      Center Forward
      294
      16
      0
      0
      0.125
    
    
      26
      adrian_7
      Goalkeeper
      29
      3
      0
      0
      0.250
    
    
      27
      adriano_24
      Left Midfielder
      373
      58
      1
      3
      0.250
    
    
      28
      adrien-rabiot
      Defensive Midfielder
      66
      11
      0
      0
      0.000
    
    
      29
      aduriz
      Center Forward
      322
      68
      3
      4
      0.250
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1403
      xherdan-shaqiri
      Left Midfielder
      224
      23
      0
      1
      0.250
    
    
      1404
      xisco_2
      Center Forward
      166
      26
      0
      0
      0.250
    
    
      1405
      yacine-brahimi
      Attacking Midfielder
      149
      20
      2
      0
      0.625
    
    
      1406
      yann-mvila
      Defensive Midfielder
      203
      29
      1
      1
      0.500
    
    
      1407
      yannick-djalo
      Center Forward
      167
      10
      0
      0
      0.750
    
    
      1408
      yannik-schulze
      Center Back
      49
      8
      0
      0
      0.000
    
    
      1409
      yassine-el-ghanassi
      Left Winger
      181
      15
      0
      0
      0.500
    
    
      1410
      yassine-jebbour
      Left Fullback
      54
      2
      1
      2
      0.625
    
    
      1411
      yaya-toure
      Defensive Midfielder
      448
      82
      0
      1
      1.000
    
    
      1412
      yoan-gouffran
      Right Winger
      269
      18
      0
      1
      0.500
    
    
      1413
      yoann-gourcuff
      Attacking Midfielder
      341
      31
      0
      2
      0.125
    
    
      1414
      yohan-cabaye
      Defensive Midfielder
      372
      86
      1
      1
      0.000
    
    
      1415
      yohandry-orozco
      Attacking Midfielder
      67
      4
      0
      0
      0.500
    
    
      1416
      yossi-benayoun
      Attacking Midfielder
      456
      31
      1
      0
      0.250
    
    
      1417
      younes-belhanda
      Attacking Midfielder
      173
      32
      2
      3
      0.250
    
    
      1418
      younes-kaboul
      Center Back
      252
      25
      4
      2
      0.500
    
    
      1419
      youssef-el-arabi
      Center Forward
      159
      15
      0
      0
      0.625
    
    
      1420
      yunus-malli
      Attacking Midfielder
      131
      4
      0
      0
      0.125
    
    
      1421
      zdenk-pospch
      Right Fullback
      297
      26
      1
      0
      0.125
    
    
      1422
      zdravko-kuzmanovic
      Defensive Midfielder
      339
      42
      1
      1
      0.000
    
    
      1423
      ze-castro
      Center Back
      169
      28
      1
      1
      0.250
    
    
      1424
      zhi-gin-lam
      Right Fullback
      127
      8
      0
      0
      0.250
    
    
      1425
      zlatan-alomerovic
      Goalkeeper
      111
      10
      0
      1
      0.000
    
    
      1426
      zlatan-ibrahimovic
      Center Forward
      607
      94
      4
      6
      0.250
    
    
      1427
      zlatko-junuzovic
      Attacking Midfielder
      361
      53
      1
      0
      0.125
    
    
      1428
      zoltan-gera
      Left Winger
      392
      44
      1
      1
      0.250
    
    
      1429
      zoltan-stieber
      Left Midfielder
      142
      12
      0
      0
      0.000
    
    
      1430
      zoumana-camara
      Center Back
      395
      46
      2
      6
      0.875
    
    
      1431
      zubikarai
      Goalkeeper
      47
      2
      0
      2
      0.000
    
    
      1432
      zurutuza
      Defensive Midfielder
      160
      22
      0
      0
      0.000
    
  

1433 rows × 7 columns

III. Unsupervized machine learning

The first idea we got is to start an unsupervized learning kept as simple as possible.

We will have to take player position, the three types of cards and the skin color: that makes 5 dimensions to deal with !

Instead, let say we only look at the total number of cards the players got, and their skin color. Then we would be able to display something in 2 dimensions only:

Then, we would try to obtain two clusters that might lead to really simple conclusion such as "dark people slightly tend to get more cards":

Again, this is totally hypothetical. So let's give it a try. We try to use a K means clustering methode to obtain 2 distinct clusters, with the help of this website: http://stamfordresearch.com/k-means-clustering-in-python/



In [117]:

    
# Input
x = Data_aggregated
x = x.drop(['playerShort'], 1)

# We have to convert every columns to floats, to be able to train our model
mapping = {'Center Back': 1, 'Attacking Midfielder': 2, 'Right Midfielder': 3, 'Center Midfielder': 4, 'Defensive Midfielder': 5, 'Goalkeeper':6, 'Left Fullback':7, 'Left Midfielder':8, 'Right Fullback':9, 'Center Forward':10, 'Left Winger':11, 'Right Winger':12}
x = x.replace({'position': mapping})
x









    Out[117]:






  
    
      
      position
      games
      yellowCards
      yellowReds
      redCards
      skinColor
    
  
  
    
      0
      1
      654
      19
      0
      0
      0.125
    
    
      1
      2
      336
      42
      0
      1
      0.125
    
    
      2
      3
      412
      11
      0
      0
      0.250
    
    
      3
      4
      260
      31
      0
      1
      0.000
    
    
      4
      1
      124
      8
      4
      2
      0.250
    
    
      5
      3
      97
      11
      1
      0
      0.750
    
    
      6
      6
      24
      0
      0
      0
      0.875
    
    
      7
      6
      3
      0
      0
      0
      0.875
    
    
      8
      5
      121
      21
      0
      2
      1.000
    
    
      9
      7
      50
      3
      0
      1
      0.875
    
    
      10
      4
      36
      2
      0
      0
      1.000
    
    
      11
      5
      246
      70
      3
      2
      0.375
    
    
      12
      4
      208
      24
      0
      2
      0.750
    
    
      13
      5
      211
      66
      1
      0
      0.250
    
    
      14
      6
      206
      4
      0
      0
      0.000
    
    
      15
      8
      131
      21
      0
      0
      0.000
    
    
      16
      8
      339
      21
      0
      0
      0.000
    
    
      17
      1
      82
      14
      1
      1
      0.000
    
    
      18
      9
      131
      29
      2
      1
      0.000
    
    
      19
      10
      182
      18
      1
      0
      0.250
    
    
      20
      6
      44
      1
      0
      1
      0.000
    
    
      21
      8
      209
      23
      1
      0
      0.250
    
    
      22
      1
      286
      59
      2
      3
      0.125
    
    
      23
      10
      178
      22
      1
      0
      0.250
    
    
      24
      11
      453
      90
      2
      3
      0.250
    
    
      25
      10
      294
      16
      0
      0
      0.125
    
    
      26
      6
      29
      3
      0
      0
      0.250
    
    
      27
      8
      373
      58
      1
      3
      0.250
    
    
      28
      5
      66
      11
      0
      0
      0.000
    
    
      29
      10
      322
      68
      3
      4
      0.250
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1403
      8
      224
      23
      0
      1
      0.250
    
    
      1404
      10
      166
      26
      0
      0
      0.250
    
    
      1405
      2
      149
      20
      2
      0
      0.625
    
    
      1406
      5
      203
      29
      1
      1
      0.500
    
    
      1407
      10
      167
      10
      0
      0
      0.750
    
    
      1408
      1
      49
      8
      0
      0
      0.000
    
    
      1409
      11
      181
      15
      0
      0
      0.500
    
    
      1410
      7
      54
      2
      1
      2
      0.625
    
    
      1411
      5
      448
      82
      0
      1
      1.000
    
    
      1412
      12
      269
      18
      0
      1
      0.500
    
    
      1413
      2
      341
      31
      0
      2
      0.125
    
    
      1414
      5
      372
      86
      1
      1
      0.000
    
    
      1415
      2
      67
      4
      0
      0
      0.500
    
    
      1416
      2
      456
      31
      1
      0
      0.250
    
    
      1417
      2
      173
      32
      2
      3
      0.250
    
    
      1418
      1
      252
      25
      4
      2
      0.500
    
    
      1419
      10
      159
      15
      0
      0
      0.625
    
    
      1420
      2
      131
      4
      0
      0
      0.125
    
    
      1421
      9
      297
      26
      1
      0
      0.125
    
    
      1422
      5
      339
      42
      1
      1
      0.000
    
    
      1423
      1
      169
      28
      1
      1
      0.250
    
    
      1424
      9
      127
      8
      0
      0
      0.250
    
    
      1425
      6
      111
      10
      0
      1
      0.000
    
    
      1426
      10
      607
      94
      4
      6
      0.250
    
    
      1427
      2
      361
      53
      1
      0
      0.125
    
    
      1428
      11
      392
      44
      1
      1
      0.250
    
    
      1429
      8
      142
      12
      0
      0
      0.000
    
    
      1430
      1
      395
      46
      2
      6
      0.875
    
    
      1431
      6
      47
      2
      0
      2
      0.000
    
    
      1432
      5
      160
      22
      0
      0
      0.000
    
  

1433 rows × 6 columns



In [118]:

    
# Output with the same length as the input, that will contains the associated cluster
y = pd.DataFrame(index=x.index, columns=['targetCluster'])
y.head()









    Out[118]:






  
    
      
      targetCluster
    
  
  
    
      0
      NaN
    
    
      1
      NaN
    
    
      2
      NaN
    
    
      3
      NaN
    
    
      4
      NaN



In [119]:

    
# K Means Cluster
model = KMeans(n_clusters=2)
model = model.fit(x)
model









    Out[119]:





KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)



In [120]:

    
# We got a model with two clusters
model.labels_









    Out[120]:





array([0, 0, 0, ..., 0, 1, 1])



In [121]:

    
# View the results
# Set the size of the plot
plt.figure(figsize=(14,7))
 
# Create a colormap for the two clusters
colormap = np.array(['blue', 'lime'])
 
# Plot the Model Classification PARTIALLY
plt.scatter((0.5*x.yellowCards + x.yellowReds + x.redCards)/x.games, x.skinColor, c=colormap[model.labels_], s=40)
plt.xlabel('Red cards per game (yellow = half a red card)')
plt.ylabel('Skin color')
plt.title('K Mean Classification')
plt.show()

(We show only skin color and number of "red cards" because it's a 2D plot, but we actually used 5 parameters: position, yellowCards, yellowReds, redCards and number of games. So this graph doesn't really represent how our data has been clustered. This is only to check if some clustering has ben done. Here we don't really see two distincts clusters. It looks like more random coloring ! :x

Now, let's add the result to each player:



In [122]:

    
cluster = pd.DataFrame(pd.Series(model.labels_, name='cluster'))
Data_Clustered = Data_aggregated
Data_Clustered['cluster'] = cluster
Data_Clustered









    Out[122]:






  
    
      
      playerShort
      position
      games
      yellowCards
      yellowReds
      redCards
      skinColor
      cluster
    
  
  
    
      0
      aaron-hughes
      Center Back
      654
      19
      0
      0
      0.125
      0
    
    
      1
      aaron-hunt
      Attacking Midfielder
      336
      42
      0
      1
      0.125
      0
    
    
      2
      aaron-lennon
      Right Midfielder
      412
      11
      0
      0
      0.250
      0
    
    
      3
      aaron-ramsey
      Center Midfielder
      260
      31
      0
      1
      0.000
      1
    
    
      4
      abdelhamid-el-kaoutari
      Center Back
      124
      8
      4
      2
      0.250
      1
    
    
      5
      abdou-traore_2
      Right Midfielder
      97
      11
      1
      0
      0.750
      1
    
    
      6
      abdoulaye-diallo_2
      Goalkeeper
      24
      0
      0
      0
      0.875
      1
    
    
      7
      abdoulaye-keita_2
      Goalkeeper
      3
      0
      0
      0
      0.875
      1
    
    
      8
      abdoulwhaid-sissoko
      Defensive Midfielder
      121
      21
      0
      2
      1.000
      1
    
    
      9
      abdul-rahman-baba
      Left Fullback
      50
      3
      0
      1
      0.875
      1
    
    
      10
      abdul-razak
      Center Midfielder
      36
      2
      0
      0
      1.000
      1
    
    
      11
      abel-aguilar
      Defensive Midfielder
      246
      70
      3
      2
      0.375
      1
    
    
      12
      abou-diaby
      Center Midfielder
      208
      24
      0
      2
      0.750
      1
    
    
      13
      adam-bodzek
      Defensive Midfielder
      211
      66
      1
      0
      0.250
      1
    
    
      14
      adam-federici
      Goalkeeper
      206
      4
      0
      0
      0.000
      1
    
    
      15
      adam-hlousek
      Left Midfielder
      131
      21
      0
      0
      0.000
      1
    
    
      16
      adam-johnson
      Left Midfielder
      339
      21
      0
      0
      0.000
      0
    
    
      17
      adam-pinter
      Center Back
      82
      14
      1
      1
      0.000
      1
    
    
      18
      adam-smith_3
      Right Fullback
      131
      29
      2
      1
      0.000
      1
    
    
      19
      adam-szalai
      Center Forward
      182
      18
      1
      0
      0.250
      1
    
    
      20
      adan
      Goalkeeper
      44
      1
      0
      1
      0.000
      1
    
    
      21
      adel-taarabt
      Left Midfielder
      209
      23
      1
      0
      0.250
      1
    
    
      22
      adil-rami
      Center Back
      286
      59
      2
      3
      0.125
      0
    
    
      23
      adrian-colunga
      Center Forward
      178
      22
      1
      0
      0.250
      1
    
    
      24
      adrian-mutu
      Left Winger
      453
      90
      2
      3
      0.250
      0
    
    
      25
      adrian_2
      Center Forward
      294
      16
      0
      0
      0.125
      0
    
    
      26
      adrian_7
      Goalkeeper
      29
      3
      0
      0
      0.250
      1
    
    
      27
      adriano_24
      Left Midfielder
      373
      58
      1
      3
      0.250
      0
    
    
      28
      adrien-rabiot
      Defensive Midfielder
      66
      11
      0
      0
      0.000
      1
    
    
      29
      aduriz
      Center Forward
      322
      68
      3
      4
      0.250
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1403
      xherdan-shaqiri
      Left Midfielder
      224
      23
      0
      1
      0.250
      1
    
    
      1404
      xisco_2
      Center Forward
      166
      26
      0
      0
      0.250
      1
    
    
      1405
      yacine-brahimi
      Attacking Midfielder
      149
      20
      2
      0
      0.625
      1
    
    
      1406
      yann-mvila
      Defensive Midfielder
      203
      29
      1
      1
      0.500
      1
    
    
      1407
      yannick-djalo
      Center Forward
      167
      10
      0
      0
      0.750
      1
    
    
      1408
      yannik-schulze
      Center Back
      49
      8
      0
      0
      0.000
      1
    
    
      1409
      yassine-el-ghanassi
      Left Winger
      181
      15
      0
      0
      0.500
      1
    
    
      1410
      yassine-jebbour
      Left Fullback
      54
      2
      1
      2
      0.625
      1
    
    
      1411
      yaya-toure
      Defensive Midfielder
      448
      82
      0
      1
      1.000
      0
    
    
      1412
      yoan-gouffran
      Right Winger
      269
      18
      0
      1
      0.500
      1
    
    
      1413
      yoann-gourcuff
      Attacking Midfielder
      341
      31
      0
      2
      0.125
      0
    
    
      1414
      yohan-cabaye
      Defensive Midfielder
      372
      86
      1
      1
      0.000
      0
    
    
      1415
      yohandry-orozco
      Attacking Midfielder
      67
      4
      0
      0
      0.500
      1
    
    
      1416
      yossi-benayoun
      Attacking Midfielder
      456
      31
      1
      0
      0.250
      0
    
    
      1417
      younes-belhanda
      Attacking Midfielder
      173
      32
      2
      3
      0.250
      1
    
    
      1418
      younes-kaboul
      Center Back
      252
      25
      4
      2
      0.500
      1
    
    
      1419
      youssef-el-arabi
      Center Forward
      159
      15
      0
      0
      0.625
      1
    
    
      1420
      yunus-malli
      Attacking Midfielder
      131
      4
      0
      0
      0.125
      1
    
    
      1421
      zdenk-pospch
      Right Fullback
      297
      26
      1
      0
      0.125
      0
    
    
      1422
      zdravko-kuzmanovic
      Defensive Midfielder
      339
      42
      1
      1
      0.000
      0
    
    
      1423
      ze-castro
      Center Back
      169
      28
      1
      1
      0.250
      1
    
    
      1424
      zhi-gin-lam
      Right Fullback
      127
      8
      0
      0
      0.250
      1
    
    
      1425
      zlatan-alomerovic
      Goalkeeper
      111
      10
      0
      1
      0.000
      1
    
    
      1426
      zlatan-ibrahimovic
      Center Forward
      607
      94
      4
      6
      0.250
      0
    
    
      1427
      zlatko-junuzovic
      Attacking Midfielder
      361
      53
      1
      0
      0.125
      0
    
    
      1428
      zoltan-gera
      Left Winger
      392
      44
      1
      1
      0.250
      0
    
    
      1429
      zoltan-stieber
      Left Midfielder
      142
      12
      0
      0
      0.000
      1
    
    
      1430
      zoumana-camara
      Center Back
      395
      46
      2
      6
      0.875
      0
    
    
      1431
      zubikarai
      Goalkeeper
      47
      2
      0
      2
      0.000
      1
    
    
      1432
      zurutuza
      Defensive Midfielder
      160
      22
      0
      0
      0.000
      1
    
  

1433 rows × 8 columns

So, do we have any new information ? What can we conclude of this ? We can use the "silhouette score", which is a metric showing if the two clusters are well separated. It it's equals to 1, the clusters are perfectly separated, and if it's 0, the clustering doesn't make any sense.



In [131]:

    
score = silhouette_score(x, model.labels_)
score









    Out[131]:





0.57982765748640508

We got a silhouette score of 58%, which is honestly not enough to predict precisely the skin color of new players. A value closer to +1 would have indicated with higher confidence a difference between the clusters. 60% is enough to distinguish the two clusters but, still, we cannot rely on this model. Let's try to remove features iterately, starting with skin color.



In [130]:

    
x_noSkinColor = x.drop(['skinColor'], 1)
model = KMeans(n_clusters=2)
model = model.fit(x_noSkinColor)
score_noSkinColor = silhouette_score(x_noSkinColor, model.labels_)
score_noSkinColor









    Out[130]:





0.5798341374557211



In [134]:

    
score_noSkinColor / score









    Out[134]:





1.000011175681657

Seems like removing skin color from the input didn't change anything for the clustering performance ! Let's do this with removing another parameter: position.



In [142]:

    
x_noPosition = x.drop(['position'], 1)
model = KMeans(n_clusters=2)
model = model.fit(x_noPosition)
score_noPosition= silhouette_score(x_noPosition, model.labels_)
score_noPosition









    Out[142]:





0.58072547074299929



In [143]:

    
score_noPosition / score









    Out[143]:





1.0015484139899196

Player position doesn't have much impact either. We can try to remove the number of games, but it won't make sense: some player will have an absolute higher number of cards, only because they played a lot more games. But we will lost this information.



In [144]:

    
x_noGameNumber = x.drop(['games'], 1)
model = KMeans(n_clusters=2)
model = model.fit(x_noGameNumber)
score_noGameNumber = silhouette_score(x_noGameNumber, model.labels_)
score_noGameNumber









    Out[144]:





0.59054132887863631



In [145]:

    
score_noGameNumber / score









    Out[145]:





1.0184773376259348

Well, that makes a 2% improvement, but the information is biased ! This model doesn't show anything helpful. Whatever feature we remove, we don't get a good prediction with unsupervized learning. It doesn't mean that there is absolutely zero correlation between the skin color and the number of cards a players can get. But we're not able to predict correctly the skin color of a player, according to the different features we studied previously.

	playerShort	player	club	leagueCountry	birthday	height	weight	position	games	victories	ties	defeats
0	lucas-wilchez	Lucas Wilchez	Real Zaragoza	Spain	31.08.1983	177.0	72.0	Attacking Midfielder	1	0	0	1
1	john-utaka	John Utaka	Montpellier HSC	France	08.01.1982	179.0	82.0	Right Winger	1	0	0	1
2	abdon-prats	Abdón Prats	RCD Mallorca	Spain	17.12.1992	181.0	79.0	NaN	1	0	1	0
3	pablo-mari	Pablo Marí	RCD Mallorca	Spain	31.08.1993	191.0	87.0	Center Back	1	1	0	0
4	ruben-pena	Rubén Peña	Real Valladolid	Spain	18.07.1991	172.0	70.0	Right Midfielder	1	1	0	0
5	aaron-hughes	Aaron Hughes	Fulham FC	England	08.11.1979	182.0	71.0	Center Back	1	0	0	1
6	aleksandar-kolarov	Aleksandar Kolarov	Manchester City	England	10.11.1985	187.0	80.0	Left Fullback	1	1	0	0
7	alexander-tettey	Alexander Tettey	Norwich City	England	04.04.1986	180.0	68.0	Defensive Midfielder	1	0	0	1
8	anders-lindegaard	Anders Lindegaard	Manchester United	England	13.04.1984	193.0	80.0	Goalkeeper	1	0	1	0
9	andreas-beck	Andreas Beck	1899 Hoffenheim	Germany	13.03.1987	180.0	70.0	Right Fullback	1	1	0	0
10	antonio-rukavina	Antonio Rukavina	Real Valladolid	Spain	26.01.1984	177.0	74.0	Right Fullback	2	2	0	0

	yellowCards	photoID	rater1	rater2	refNum	refCountry	Alpha_3	meanIAT	nIAT	seIAT	meanExp	nExp	seExp
0	0	95212.jpg	0.25	0.50	1	1	GRC	0.326391	712.0	0.000564	0.396000	750.0	0.002696
1	1	1663.jpg	0.75	0.75	2	2	ZMB	0.203375	40.0	0.010875	-0.204082	49.0	0.061504
2	1	NaN	NaN	NaN	3	3	ESP	0.369894	1785.0	0.000229	0.588297	1897.0	0.001002
3	0	NaN	NaN	NaN	3	3	ESP	0.369894	1785.0	0.000229	0.588297	1897.0	0.001002
4	0	NaN	NaN	NaN	3	3	ESP	0.369894	1785.0	0.000229	0.588297	1897.0	0.001002
5	0	3868.jpg	0.25	0.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
6	0	47704.jpg	0.00	0.25	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
7	0	22356.jpg	1.00	1.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
8	0	16528.jpg	0.25	0.25	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
9	0	36499.jpg	0.00	0.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
10	1	59786.jpg	0.00	0.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752

	playerShort	position	games	yellowCards	yellowReds	redCards	skinColor
0	aaron-hughes	Center Back	654	19	0	0	0.125
1	aaron-hunt	Attacking Midfielder	336	42	0	1	0.125
2	aaron-lennon	Right Midfielder	412	11	0	0	0.250
3	aaron-ramsey	Center Midfielder	260	31	0	1	0.000
4	abdelhamid-el-kaoutari	Center Back	124	8	4	2	0.250
5	abdou-traore_2	Right Midfielder	97	11	1	0	0.750
6	abdoulaye-diallo_2	Goalkeeper	24	0	0	0	0.875
7	abdoulaye-keita_2	Goalkeeper	3	0	0	0	0.875
8	abdoulwhaid-sissoko	Defensive Midfielder	121	21	0	2	1.000
9	abdul-rahman-baba	Left Fullback	50	3	0	1	0.875
10	abdul-razak	Center Midfielder	36	2	0	0	1.000
11	abel-aguilar	Defensive Midfielder	246	70	3	2	0.375
12	abou-diaby	Center Midfielder	208	24	0	2	0.750
13	adam-bodzek	Defensive Midfielder	211	66	1	0	0.250
14	adam-federici	Goalkeeper	206	4	0	0	0.000
15	adam-hlousek	Left Midfielder	131	21	0	0	0.000
16	adam-johnson	Left Midfielder	339	21	0	0	0.000
17	adam-pinter	Center Back	82	14	1	1	0.000
18	adam-smith_3	Right Fullback	131	29	2	1	0.000
19	adam-szalai	Center Forward	182	18	1	0	0.250
20	adan	Goalkeeper	44	1	0	1	0.000
21	adel-taarabt	Left Midfielder	209	23	1	0	0.250
22	adil-rami	Center Back	286	59	2	3	0.125
23	adrian-colunga	Center Forward	178	22	1	0	0.250
24	adrian-mutu	Left Winger	453	90	2	3	0.250
25	adrian_2	Center Forward	294	16	0	0	0.125
26	adrian_7	Goalkeeper	29	3	0	0	0.250
27	adriano_24	Left Midfielder	373	58	1	3	0.250
28	adrien-rabiot	Defensive Midfielder	66	11	0	0	0.000
29	aduriz	Center Forward	322	68	3	4	0.250
...	...	...	...	...	...	...	...
1403	xherdan-shaqiri	Left Midfielder	224	23	0	1	0.250
1404	xisco_2	Center Forward	166	26	0	0	0.250
1405	yacine-brahimi	Attacking Midfielder	149	20	2	0	0.625
1406	yann-mvila	Defensive Midfielder	203	29	1	1	0.500
1407	yannick-djalo	Center Forward	167	10	0	0	0.750
1408	yannik-schulze	Center Back	49	8	0	0	0.000
1409	yassine-el-ghanassi	Left Winger	181	15	0	0	0.500
1410	yassine-jebbour	Left Fullback	54	2	1	2	0.625
1411	yaya-toure	Defensive Midfielder	448	82	0	1	1.000
1412	yoan-gouffran	Right Winger	269	18	0	1	0.500
1413	yoann-gourcuff	Attacking Midfielder	341	31	0	2	0.125
1414	yohan-cabaye	Defensive Midfielder	372	86	1	1	0.000
1415	yohandry-orozco	Attacking Midfielder	67	4	0	0	0.500
1416	yossi-benayoun	Attacking Midfielder	456	31	1	0	0.250
1417	younes-belhanda	Attacking Midfielder	173	32	2	3	0.250
1418	younes-kaboul	Center Back	252	25	4	2	0.500
1419	youssef-el-arabi	Center Forward	159	15	0	0	0.625
1420	yunus-malli	Attacking Midfielder	131	4	0	0	0.125
1421	zdenk-pospch	Right Fullback	297	26	1	0	0.125
1422	zdravko-kuzmanovic	Defensive Midfielder	339	42	1	1	0.000
1423	ze-castro	Center Back	169	28	1	1	0.250
1424	zhi-gin-lam	Right Fullback	127	8	0	0	0.250
1425	zlatan-alomerovic	Goalkeeper	111	10	0	1	0.000
1426	zlatan-ibrahimovic	Center Forward	607	94	4	6	0.250
1427	zlatko-junuzovic	Attacking Midfielder	361	53	1	0	0.125
1428	zoltan-gera	Left Winger	392	44	1	1	0.250
1429	zoltan-stieber	Left Midfielder	142	12	0	0	0.000
1430	zoumana-camara	Center Back	395	46	2	6	0.875
1431	zubikarai	Goalkeeper	47	2	0	2	0.000
1432	zurutuza	Defensive Midfielder	160	22	0	0	0.000